November 25, 2017
library(tidyverse) # data proses, summary, table library(reshape2) # data proses library(plotly) # interaktif grafik library(knitr) # rmarkdown library(kableExtra) # güzel talo output library(sjPlot) # ozel göserimler
Başlangıç https://www.statmethods.net/stats/descriptives.html
Külliyat http://r4ds.had.co.nz/index.html
– Amaç
– Hedef Kitle
– Soru Formu
– Örnekleme
– Data Proses
– Analiz
– Raporlama
https://en.wikipedia.org/wiki/Margin_of_error
\[ z * \sqrt\frac{p * q}{n} \]
\[ z * \sqrt\frac{p * q}{n} \]
hata_payi <- function(Uzay = 55000000, Orneklem) {
p <- 0.5
q <- 1 - p
if ( Orneklem / Uzay < 0.05) { # örneklem < %5Uzay
MyHataPayi <- 1.96 * sqrt((p * q) / Orneklem)
} else { # örneklem > %5Uzay
MyHataPayi <- (1.96 * sqrt((p * q) / Orneklem)) *
sqrt( (Uzay - Orneklem) / (Uzay - 1))
}
MyHataPayi
}
orn <- seq(50,1000, by = 10)
hp <- sapply(orn, function(x) hata_payi(Orneklem = x) )
df <- data.frame(orn, hp)
gg <- ggplot(df, aes(x = orn, y = hp )) +
geom_point() + xlab("Örneklem") + ylab("Hata Payı") +
labs(title = "Örnklemlem büyüklüğü ile hata payı ilişkisi"
, subtitle = "Ters logaritmik ilişki vardir")
ggplotly(gg + theme_minimal())
Gerçek çalışmalardan örnek sorular üzerinden geçelim
library(readxl)
dat <- read_xlsx("data/ornek_data.xlsx", sheet = "data", skip = 1)
names(dat)
## [1] "anketid" "gm1" "sys1" "sys2" "sys3" "sys4" "sys5" ## [8] "sys6" "oneri" "k1" "t4" "s1a" "s1b" "s1c" ## [15] "s1d" "s1e" "s1f" "s1g" "c5d" "c6" "m2_01" ## [22] "m2_02" "m2_03" "m2_04" "m2_05" "m2_06" "m2_07" "m2_08" ## [29] "m2_09" "m2_10" "m2_11" "m2_12" "m2_13" "m2_14" "m2_15" ## [36] "m2_16" "k_01" "k_02" "k_03" "k_04" "k_05" "k_06" ## [43] "k_07" "k_08" "k_09" "k_10" "k_11" "k_12" "k_13" ## [50] "bin1" "bin2" "bin3" "num1" "num2"
library(readxl)
dat <- read.table(file = "clipboard", sep = "\t"
, header=TRUE,stringsAsFactors = FALSE)
xfac <- c("gm1", "sys1", "sys2", "sys3", "sys4", "sys5", "c5d","c6" )
dat[,xfac] <- dat[,xfac] %>%
lapply(function(x) factor(x, exclude = "99", ordered = T) )
dat$sys6 <- factor(dat$sys6, levels=c(1,2,99)
, labels=c("Evet yaşadım"
,"Hayır yaşamadım"
,"Hatırlamıyorum") )
dat$k1 <- factor(dat$k1, levels=c(1:3)
, labels=c("Satış temsilcisi"
,"Çağrı merkezi"
,"İnternet şube") )
dat$t4 <- factor(dat$t4, levels=c(2,3)
, labels=c("25-29", "30-40"), ordered = T )
xmar <- c("s1a", "s1b", "s1c", "s1d", "s1e", "s1f", "s1g")
dat[,xmar] <- dat[,xmar] %>%
lapply(function(x)
factor(x, levels=c(1:13)
, labels=c('Aroma' , 'Cappy' , 'Dimes'
, 'Exotic' , 'Tamek' , 'Pinar'
, 'Halk' , 'Ulker Icim' , 'Juss'
, 'Meysu' , 'Meyoz' , 'Jucy' , 'Uludag') ) )
xmem <- c('m2_01' , 'm2_02' , 'm2_03' , 'm2_04' , 'm2_05'
, 'm2_06' , 'm2_07' , 'm2_08' , 'm2_09' , 'm2_10'
, 'm2_11' , 'm2_12' , 'm2_13' , 'm2_14' , 'm2_15'
, 'm2_16')
dat[,xmem] <- dat[,xmem] %>%
lapply(function(x)
factor(x, levels=c(1:5),
labels=c('Kesinlikle Katılmıyorum'
, 'Katılmıyorum'
, 'Ne Katılıyorum Ne Katılmıyor'
, 'Katılıyorum'
, 'Kesinlikle Katılıyorum')
, ordered = T ) )
str(dat[,1:20])
## Classes 'tbl_df', 'tbl' and 'data.frame': 100 obs. of 20 variables: ## $ anketid: num 1 2 3 4 5 6 7 8 9 10 ... ## $ gm1 : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 3 6 2 10 4 6 7 6 6 5 ... ## $ sys1 : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 7 8 7 3 1 2 8 4 NA 10 ... ## $ sys2 : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 10 NA 8 4 3 9 6 4 2 6 ... ## $ sys3 : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 9 7 4 6 7 9 8 3 8 4 ... ## $ sys4 : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 3 5 2 10 1 9 9 10 3 NA ... ## $ sys5 : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 6 10 8 8 2 4 9 4 7 NA ... ## $ sys6 : Factor w/ 3 levels "Evet yasadim",..: 1 1 2 2 1 1 2 2 3 1 ... ## $ oneri : chr "perferendis doloribus asperiores repellat" "optio cumque nihil impedit quo" "saepe eveniet ut et voluptates" NA ... ## $ k1 : Factor w/ 3 levels "Satis temsilcisi",..: 1 1 3 1 1 3 3 3 1 3 ... ## $ t4 : Ord.factor w/ 2 levels "25-29"<"30-40": 2 2 2 2 1 1 2 2 2 1 ... ## $ s1a : Factor w/ 13 levels "Aroma","Cappy",..: 12 10 13 6 5 13 13 12 10 6 ... ## $ s1b : Factor w/ 13 levels "Aroma","Cappy",..: NA 1 2 1 2 2 2 2 NA 1 ... ## $ s1c : Factor w/ 13 levels "Aroma","Cappy",..: 3 3 4 4 4 4 4 3 3 3 ... ## $ s1d : Factor w/ 13 levels "Aroma","Cappy",..: 3 10 2 1 4 13 4 2 NA 6 ... ## $ s1e : Factor w/ 13 levels "Aroma","Cappy",..: 12 1 13 6 5 4 2 12 10 6 ... ## $ s1f : Factor w/ 13 levels "Aroma","Cappy",..: 12 3 13 1 4 4 13 3 NA 6 ... ## $ s1g : Factor w/ 13 levels "Aroma","Cappy",..: 12 3 2 4 5 4 4 3 10 6 ... ## $ c5d : Ord.factor w/ 8 levels "1"<"2"<"3"<"4"<..: 3 5 1 3 5 3 7 1 4 1 ... ## $ c6 : Ord.factor w/ 5 levels "1"<"2"<"3"<"4"<..: 3 5 1 1 4 2 3 1 4 4 ...
str(dat[,21:40])
## Classes 'tbl_df', 'tbl' and 'data.frame': 100 obs. of 20 variables: ## $ m2_01: Ord.factor w/ 5 levels "Kesinlikle Katilmiyorum"<..: 4 1 3 5 4 2 3 3 2 3 ... ## $ m2_02: Ord.factor w/ 5 levels "Kesinlikle Katilmiyorum"<..: 2 3 3 2 3 4 2 4 4 2 ... ## $ m2_03: Ord.factor w/ 5 levels "Kesinlikle Katilmiyorum"<..: 4 3 4 1 5 5 4 2 2 2 ... ## $ m2_04: Ord.factor w/ 5 levels "Kesinlikle Katilmiyorum"<..: 4 3 2 3 3 3 5 2 5 5 ... ## $ m2_05: Ord.factor w/ 5 levels "Kesinlikle Katilmiyorum"<..: 4 4 4 3 3 2 2 1 4 4 ... ## $ m2_06: Ord.factor w/ 5 levels "Kesinlikle Katilmiyorum"<..: 4 3 4 4 2 4 5 3 2 2 ... ## $ m2_07: Ord.factor w/ 5 levels "Kesinlikle Katilmiyorum"<..: 3 3 4 3 3 2 2 4 3 4 ... ## $ m2_08: Ord.factor w/ 5 levels "Kesinlikle Katilmiyorum"<..: 2 3 2 2 3 3 4 4 2 3 ... ## $ m2_09: Ord.factor w/ 5 levels "Kesinlikle Katilmiyorum"<..: 3 3 4 2 3 4 3 5 2 3 ... ## $ m2_10: Ord.factor w/ 5 levels "Kesinlikle Katilmiyorum"<..: 4 2 3 2 2 3 5 2 3 3 ... ## $ m2_11: Ord.factor w/ 5 levels "Kesinlikle Katilmiyorum"<..: 2 2 4 3 2 3 4 4 3 2 ... ## $ m2_12: Ord.factor w/ 5 levels "Kesinlikle Katilmiyorum"<..: 2 2 3 3 3 2 2 2 3 4 ... ## $ m2_13: Ord.factor w/ 5 levels "Kesinlikle Katilmiyorum"<..: 3 2 3 2 3 2 3 4 4 3 ... ## $ m2_14: Ord.factor w/ 5 levels "Kesinlikle Katilmiyorum"<..: 4 3 3 4 3 3 2 3 4 3 ... ## $ m2_15: Ord.factor w/ 5 levels "Kesinlikle Katilmiyorum"<..: 3 5 2 3 3 3 3 4 2 3 ... ## $ m2_16: Ord.factor w/ 5 levels "Kesinlikle Katilmiyorum"<..: 2 4 2 2 3 4 3 3 2 4 ... ## $ k_01 : chr "33" "11,24,0" "7,11,24,33" "7,11,24,33" ... ## $ k_02 : chr "11,0" "7,24,0" "7,11,0" "24,0" ... ## $ k_03 : chr "7,11,0" "0" "0" "7,33" ... ## $ k_04 : chr "7,11,33" "11,33" "7,24,33" "7,11,24,33" ...
with(dat, table(sys6, k1)) %>% kable()
| Satis temsilcisi | Çagri merkezi | Internet sube | |
|---|---|---|---|
| Evet yasadim | 20 | 14 | 14 |
| Hayir yasamadim | 17 | 16 | 16 |
| Hatirlamiyorum | 3 | 0 | 0 |
dat %>% group_by(sys6, k1) %>% summarise(say = n()) %>% spread(k1, say) %>% kable(format = "markdown")
| sys6 | Satis temsilcisi | Çagri merkezi | Internet sube |
|---|---|---|---|
| Evet yasadim | 20 | 14 | 14 |
| Hayir yasamadim | 17 | 16 | 16 |
| Hatirlamiyorum | 3 | NA | NA |
dat %>% dplyr::select(anketid, s1a:s1g) %>% melt(id.vars = "anketid", na.rm = T) %>% group_by(variable) %>% mutate(s1baz = n_distinct(anketid) ) %>% group_by(variable, value) %>% summarise(s1_pay = n_distinct(anketid) / mean(s1baz) * 100) %>% mutate(s1_pay = round(s1_pay,1)) %>% spread(variable, s1_pay) -> xtables1
## # A tibble: 13 x 8 ## value s1a s1b s1c s1d s1e s1f s1g ## * <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 Aroma NA 56.2 NA 15.6 14.6 15.7 11.2 ## 2 Cappy NA 43.8 NA 12.2 12.4 7.9 10.1 ## 3 Dimes NA NA 36 12.2 12.4 16.9 13.5 ## 4 Exotic NA NA 64 22.2 23.6 24.7 27.0 ## 5 Halk 14 NA NA 4.4 3.4 4.5 6.7 ## 6 Jucy 13 NA NA 4.4 9.0 5.6 4.5 ## 7 Juss 18 NA NA 10.0 6.7 6.7 6.7 ## 8 Meyoz 8 NA NA 2.2 3.4 3.4 4.5 ## 9 Meysu 13 NA NA 6.7 4.5 2.2 4.5 ## 10 Pinar 9 NA NA 5.6 3.4 2.2 2.2 ## 11 Tamek 6 NA NA NA 2.2 3.4 4.5 ## 12 Ulker Icim 4 NA NA 1.1 1.1 1.1 1.1 ## 13 Uludag 15 NA NA 3.3 3.4 5.6 3.4
library(sjPlot) ; library(sjmisc) ; library(knitr)
dat %>%
dplyr::select(m2_01:m2_16) %>%
sjt.stackfrq(show.n = TRUE, show.total = TRUE
, show.skew = TRUE
, show.kurtosis = TRUE
, altr.row.col = TRUE)
library(sjPlot) ; library(sjmisc) ; library(knitr) dat %>% dplyr::select(m2_01:m2_16) %>% sjp.likert(sort.frq = "pos.asc", geom.colors = "RdBu")
z, t, chi-square ve diğer.
En basit anlatımla Hipotez testi, dagilimlarin birbirini kesip kesmediğini analiz etmektir
Ortalamalar Eşittir : \[ H_o: \mu = \mu_{0} \] \[ H_a: \mu \neq \mu_{0} \]
Ortalama 37'ye eşittir : \[ H_o: \mu = 37 \] \[ H_a: \mu \neq 37 \]
Test istatistiği : \[ z = \frac{\bar{x} - \mu_0}{\sigma/\sqrt{n}} \]
sjt.xtab(dat$sys6, dat$t4, show.row.prc = TRUE,
show.cell.prc = TRUE, show.legend = TRUE)
# library(mosaic) xt <- with(dat, t.test(num1~bin1)) xt
## ## Welch Two Sample t-test ## ## data: num1 by bin1 ## t = -1.0682, df = 97.997, p-value = 0.2881 ## alternative hypothesis: true difference in means is not equal to 0 ## 95 percent confidence interval: ## -0.6156922 0.1848093 ## sample estimates: ## mean in group 0 mean in group 1 ## 3.051013 3.266455
# xpt(q=xt$statistic, lower.tail=FALSE, df=99)
xt <- with(dat, t.test(num1,num2)) xt
## ## Welch Two Sample t-test ## ## data: num1 and num2 ## t = 2.1186, df = 195.81, p-value = 0.03539 ## alternative hypothesis: true difference in means is not equal to 0 ## 95 percent confidence interval: ## 0.01987015 0.55520749 ## sample estimates: ## mean of x mean of y ## 3.158734 2.871195
# xpt(q=xt$statistic, lower.tail=TRUE, df=99)
xt <- with(dat, t.test(num1,num2, paired = T)) xt
## ## Paired t-test ## ## data: num1 and num2 ## t = 2.0839, df = 99, p-value = 0.03975 ## alternative hypothesis: true difference in means is not equal to 0 ## 95 percent confidence interval: ## 0.01375288 0.56132476 ## sample estimates: ## mean of the differences ## 0.2875388
xt <- with(dat, t.test(num2,mu=3)) xt
## ## One Sample t-test ## ## data: num2 ## t = -1.4192, df = 99, p-value = 0.159 ## alternative hypothesis: true mean is not equal to 3 ## 95 percent confidence interval: ## 2.691116 3.051275 ## sample estimates: ## mean of x ## 2.871195
# xpt(q=xt$statistic, lower.tail=TRUE, df=99)
fit <- aov(num1 ~ factor(t4), data=dat) # display Type I ANOVA table summary(fit)
## Df Sum Sq Mean Sq F value Pr(>F) ## factor(t4) 1 0.42 0.4217 0.412 0.523 ## Residuals 98 100.40 1.0245
layout(matrix(c(1,2,3,4),2,2)) plot(fit)
# Multiple Comparisons TukeyHSD(fit) # where fit comes from aov()
## Tukey multiple comparisons of means ## 95% family-wise confidence level ## ## Fit: aov(formula = num1 ~ factor(t4), data = dat) ## ## $`factor(t4)` ## diff lwr upr p adj ## 30-40-25-29 0.130295 -0.2727279 0.5333178 0.5226521
with(dat, wilcox.test(num1~bin1))
## ## Wilcoxon rank sum test with continuity correction ## ## data: num1 by bin1 ## W = 1144, p-value = 0.467 ## alternative hypothesis: true location shift is not equal to 0
with(dat, wilcox.test(num1,num2) )
## ## Wilcoxon rank sum test with continuity correction ## ## data: num1 and num2 ## W = 5715, p-value = 0.08085 ## alternative hypothesis: true location shift is not equal to 0
with(dat, wilcox.test(num1,num2,paired=TRUE) )
## ## Wilcoxon signed rank test with continuity correction ## ## data: num1 and num2 ## V = 3193, p-value = 0.02173 ## alternative hypothesis: true location shift is not equal to 0
with(dat, kruskal.test(num1~factor(t4)))
## ## Kruskal-Wallis rank sum test ## ## data: num1 by factor(t4) ## Kruskal-Wallis chi-squared = 0.56828, df = 1, p-value = 0.4509
with(dat, friedman.test(num1~t4|sys1))